
## This file takes the results of a scraper of the DOJ website, which are
# split into a main list and an archive list (mirroring how the site presents
# them), and generates a universe of False Claims Act Medicare press releases.

# All file names below are relative to the scraper's output directory.
setwd("/Users/jetson/Dropbox (MIT)/MITEcon/Research/FalseClaimsAct/2019/DOJScraper")

####  Main PRs

# The file has no header row; the two columns are named date and text here.
d <- read.csv("PRList.csv", header = FALSE)
names(d) <- c("date", "text")

## Clean
# Replace every non-alphanumeric character with a space, then lower-case, so
# the keyword filters below are insensitive to case and punctuation.
d$text <- tolower(gsub("[^[:alnum:]]", " ", d$text))

# Row count before filtering
nrow(d)

# Subset for FCA medicare
# Step 1: keep releases that mention the False Claims Act
d <- d[grepl("false claims", d$text), ]
nrow(d)

# Step 2: of those, keep releases that mention Medicare AND a
# qui tam / whistleblower suit
mentions_medicare <- grepl("medicare", d$text)
mentions_relator <- grepl("qui tam|whistleblower", d$text)
d <- d[mentions_medicare & mentions_relator, ]

# Row count after all filters
nrow(d)

#### Archive PRs
#setwd("/Users/jetson/Dropbox (MIT)/MITEcon/Research/FalseClaimsAct/DOJScraper")

# Same two-column, header-less layout as the main list.
e <- read.csv("archivePRList.csv", header = FALSE)
nrow(e)
names(e) <- c("date", "text")

## Remove some bad date strings
# Strip the archive index file name that is embedded in some date values.
e$date <- gsub("/index-archive.html", "", e$date)

# Clean text. Order matters: trim the raw text first, swap non-alphanumerics
# for spaces, lower-case, and finally collapse runs of spaces into one.
e$text <- gsub(
  "( )+", " ",
  tolower(gsub("[^[:alnum:]]", " ", trimws(e$text)))
)

# Subset for FCA medicare
# Row count before filtering
nrow(e)

# Step 1: keep releases that mention the False Claims Act
e <- e[grepl("false claims", e$text), ]
nrow(e)

# Step 2: of those, keep releases that mention Medicare AND a
# qui tam / whistleblower suit
archive_medicare <- grepl("medicare", e$text)
archive_relator <- grepl("qui tam|whistleblower", e$text)
e <- e[archive_medicare & archive_relator, ]

# Row count after all filters
nrow(e)




# Stack the main and archive press releases into a single data frame.
Data <- rbind(d, e)


### Clean up dates

# Delete days of week
Data$date <- gsub("Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday", "", Data$date)
# Trim whitespace from front and end
Data$date <- trimws(Data$date)
# Shrink runs of whitespace to a single space
Data$date <- gsub("[[:space:]]+", " ", Data$date)
Dates <- strsplit(Data$date, " ")

# Two layouts are handled:
#   * three space-separated tokens: month is token 1, year is token 3
#   * anything else: the first token is treated as a "/"-separated string
#     whose first four characters are the year (or "Pre_") and whose second
#     "/" segment holds the month
# An empty or missing date yields NA here instead of a list column or a
# "subscript out of bounds" error.
token_count <- lengths(Dates)
first_token <- vapply(Dates, function(x) x[1], character(1))
is_three_part <- token_count == 3

# If we split on spaces, 3rd element; otherwise first 4 characters
Data$Year <- substr(first_token, 1, 4)
Data$Year[is_three_part] <- vapply(
  Dates[is_three_part], function(x) x[3], character(1)
)

## If we just grabbed "Pre_", take the two-digit year off the end and
## prefix it with "19". %in% keeps NA years out of the assignment index.
is_pre <- Data$Year %in% "Pre_"
if (any(is_pre)) {
  pre_token <- first_token[is_pre]
  Data$Year[is_pre] <- paste0(
    "19", substr(pre_token, nchar(pre_token) - 1, nchar(pre_token))
  )
}

# Get months
Data$Month <- vapply(
  strsplit(first_token, "/"), function(p) p[2], character(1)
)
Data$Month[is_three_part] <- first_token[is_three_part]
# Drop any digits fused onto the month name
Data$Month <- gsub("[[:digit:]]+", "", Data$Month)
Data$Month[Data$Month %in% "Jun"] <- "June"

# Create yearmonth
Data$YearMon <- paste(Data$Year, Data$Month, sep = "-")
library(zoo)
# Rows whose year or month could not be parsed become NA and sort last.
Data$YearMonth <- as.yearmon(Data$YearMon, "%Y-%b")

# Sort by yearmonth
Data <- Data[order(Data$YearMonth), ]
## Remove stray columns
Data$YearMon <- NULL
Data$date <- NULL




## Coerce text to string from factor
Data$text <- as.character(Data$text)

### Remove landing pages
# Use a logical mask rather than negative indices: when nothing matches,
# -grep(...) is -integer(0), which selects NO rows and would empty the data.
Data <- Data[!grepl("read more", Data$text), ]

## Add unique ID
# seq_len() gives an empty sequence for zero rows, where seq.int(0) would
# yield c(1, 0) and fail the assignment.
Data$PRID <- seq_len(nrow(Data))

## Remove common issues
# Collapse runs of spaces, then blank out HTML boilerplate left over from
# scraping (punctuation was already replaced by spaces earlier).
Data$text <- gsub("( )+", " ", Data$text)
Data$text <- gsub("doctype html public w3c dtd xhtml 1 0 transitional en http www w3 org tr xhtml1 dtd xhtml1 transitional dtd", " ", Data$text)
Data$text <- gsub("span property dc title content", " ", Data$text)

# Write the final universe of press releases to the working directory.
write.csv(Data, "PRUniverse.csv", row.names = FALSE)




